Reading the dataset downloaded from the Yelp Dataset Challenge
#Reading the dataset downloaded from the Yelp Dataset Challenge
library(readr)
business = read_csv("yelp_academic_dataset_business.csv")
checkin = read_csv("yelp_academic_dataset_checkin.csv")
reviews = read_csv("yelp_academic_dataset_review.csv")
## Warning: 3 parsing failures.
## row col expected actual
## 975201 NA 10 columns 5 columns
## 975202 date date like %Y%.%m%.%d review
## 975202 NA 10 columns 6 columns
user = read_csv("yelp_academic_dataset_user.csv")
filter = business[grep("u,',R,e,s,t,a,u,r,a,n,t,s,'",business$categories),]
restaurants = merge(filter,reviews,by = "business_id")
Plot the business distribution across the globe. 10 cities across 4 countries
#Plot countries in map
library(maps)
## Warning: package 'maps' was built under R version 3.2.3
##
## # maps v3.1: updated 'world': all lakes moved to separate new #
## # 'lakes' database. Type '?world' or 'news(package="maps")'. #
map("world", fill=TRUE, col="white", bg="lightblue", ylim=c(-60, 90), mar=c(0,0,0,0))
points(business$longitude,business$latitude, col="red", pch=16)

Distribution of ratings across restaurants
#Plot for star rating disturibution
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
rating_distribution = count(filter, "stars")
plot_rating = t(matrix(rating_distribution$freq))
colnames(plot_rating) = rating_distribution$stars
par(bg="white")
barplot(plot_rating,
main=expression(" Rating Distribution"),
xlab="Rating",
ylab="Number of restaurants",
col ="black", lwd=3,
col.lab="blue",
cex.lab=1,cex.lab = 1.25,axes=TRUE)

Mood Scoring of rating 3
# Function for clean up
cleaning_corpus = function(text) {
review_corpus = Corpus(VectorSource(text))
review_corpus = tm_map(review_corpus,tolower)
review_corpus = tm_map(review_corpus,stemDocument)
review_corpus = tm_map(review_corpus,removeWords,c(stopwords("english"),"can","also","let","see","way","got","just","get"))
review_corpus = tm_map(review_corpus,removePunctuation)
review_corpus = tm_map(review_corpus,removeNumbers)
review_corpus = tm_map(review_corpus,PlainTextDocument)
result = review_corpus
}
## To get only Restaurants data having rating = 3
data_b = business[,c("business_id","categories","stars","review_count")]
data_b = subset(data_b,stars ==3)
data_b = data_b[grep("u,',R,e,s,t,a,u,r,a,n,t,s,'",data_b$categories),]
## Filtering only rating, business id and text columns from reviews data frame for mood scoring
data_r = reviews[,c("business_id","stars","text")]
## filtering only those reviews that have 3 rating
data_r = subset(data_r,stars==3)
## Combining business and reviews data sets together for mood scoring of restaurants with rating = 3
br_data = merge(data_b,data_r,by = "business_id")
reviews_3r = br_data$text
## Text Cleaning before Mood Scoring
library(tm)
## Loading required package: NLP
library(stringr)
ctext = cleaning_corpus(reviews_3r)
text = NULL
for (j in 1:length(ctext)) {
temp = ctext[[j]]$content
if (temp!="") { text = c(text,temp) }
}
text = as.array(text)
#MOOD SCORING USING HARVARD INQUIRER
#Read in the Harvard Inquirer Dictionary
#And create a list of positive and negative words
HIDict = readLines("inqdict.txt")
dict_pos = HIDict[grep("Pos",HIDict)]
poswords = NULL
for (s in dict_pos) {
s = strsplit(s,"#")[[1]][1]
poswords = c(poswords,strsplit(s," ")[[1]][1])
}
dict_neg = HIDict[grep("Neg",HIDict)]
negwords = NULL
for (s in dict_neg) {
s = strsplit(s,"#")[[1]][1]
negwords = c(negwords,strsplit(s," ")[[1]][1])
}
poswords = tolower(poswords)
negwords = tolower(negwords)
poswords = unique(poswords)
negwords = unique(negwords)
text = str_replace_all(text,"nbsp"," ")
text = unlist(strsplit(text," "))
posmatch = match(text,poswords)
numposmatch = length(posmatch[which(posmatch>0)])
negmatch = match(text,negwords)
numnegmatch = length(negmatch[which(negmatch>0)])
labels = print(c(numposmatch,numnegmatch))
## [1] 213824 132874
names(labels) = c("POSITIVE","NEGATIVE")
library(plotrix)
## Warning: package 'plotrix' was built under R version 3.2.3
pie3D(c(numposmatch,numnegmatch),labels = names(labels),col= c("green","red"),explode= 0.1,main="Pie Chart of Mood Scoring of Rating 3")

Plotting the Word Cloud of reviews in each rating
# Word cloud for 10000 reviews per rating
# Function to do this computation
library(wordcloud)
## Loading required package: RColorBrewer
word_cloud = function(rating) {
sub = subset(restaurants, restaurants$stars.y == rating)
text = sub[sample(nrow(sub),10000),]
review_corpus = cleaning_corpus(text$text)
tdm = TermDocumentMatrix(review_corpus,control=list(minWordLength=4))
tdm2 = as.matrix(tdm)
wordcount = sort(rowSums(tdm2),decreasing=TRUE)
tdm_names = names(wordcount)
wordcloud(tdm_names,wordcount,scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))
}
word_cloud(1)

word_cloud(2)

word_cloud(3)

word_cloud(4)

word_cloud(5)

Quality of Reviews
Plotting the Average WordCount of Reviews across Rating
# Average Word count vs Rating
library(stringi)
data = restaurants[,c("text","stars.y","votes_useful")]
data$count = stri_count(data$text,regex="\\S+")
count_plot = aggregate(data[, 4], list(data$stars.y), mean)
plot(count_plot$Group.1,count_plot$x,
main=expression(" Average Word Count Across Ratings"),type ="o",
xlab="Rating",
ylab="Average Word Count",
col ="green", lwd=3,
col.lab="blue",
cex.lab=1,cex.lab = 1.25,axes=TRUE)

WordCloud of top useful 25 reviews
# Wordcloud of top 25 useful ratings
data = head(restaurants[order(restaurants$votes_useful, decreasing = T),], n = 25)
top_data = data$text
ctext = cleaning_corpus(top_data)
tdm = TermDocumentMatrix(ctext,control=list(minWordLength=1))
tdm2 = as.matrix(tdm)
wordcount = sort(rowSums(tdm2),decreasing=TRUE)
tdm_names = names(wordcount)
wordcloud(tdm_names,wordcount, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))

Readability Calculation focusing on Gunning Fog Index
# Select 500 reviews from each rating and then plot readability
# Function to compute readability
library(koRpus)
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
##
## tokenize
compute_readability = function(rating,textfile) {
review = restaurants[sample(which (restaurants$stars.y == rating), 500),]
reviewslist <- as.list(review$text)
lapply(reviewslist,write, textfile,append = TRUE)
l2.tagged = lapply(textfile, tokenize, lang="en")
l2.readability <- lapply(l2.tagged,readability)
# l2.readability (This prints all the readability indices)
}
#This is for calling the readability function. We are commenting this since it takes lot of time for knitting.
#compute_readability(1,"readability_1.txt")
#compute_readability(2,"readability_2.txt")
#compute_readability(3,"readability_3.txt")
#compute_readability(4,"readability_4.txt")
#compute_readability(5,"readability_5.txt")
# We take the Gunning Fog out of each of the above computed readability and then manually put in a dataframe
#Plotting Gunning Fog Index vs Rating
fogindex = data.frame("rating"=c(1,2,3,4,5),"index"=c(8.36,8.73,8.89,8.64,8.79))
plot(fogindex$rating,
fogindex$index,
main=expression("Average Review Readability Across Ratings"),
xlab="Rating",
ylab="Average_Fog_Index",
type="o",col ="green", lwd=3,
col.lab="blue",
cex.lab=1,cex.lab = 1.25)

Linear Regression to determine the factors which affect a restaurant’s rating
#Linear Regression to predict what affects restaurant's rating
model = lm(filter$stars ~ filter$`attributes_Accepts Credit Cards` + filter$attributes_Alcohol + filter$attributes_Attire + filter$attributes_Caters + filter$attributes_Delivery + filter$`attributes_Drive-Thru`+ filter$`attributes_Good For Dancing` + filter$`attributes_Good For Groups` + filter$`attributes_Happy Hour` + filter$attributes_Smoking + filter$`attributes_Wi-Fi` + filter$review_count)
summary (model)
##
## Call:
## lm(formula = filter$stars ~ filter$`attributes_Accepts Credit Cards` +
## filter$attributes_Alcohol + filter$attributes_Attire + filter$attributes_Caters +
## filter$attributes_Delivery + filter$`attributes_Drive-Thru` +
## filter$`attributes_Good For Dancing` + filter$`attributes_Good For Groups` +
## filter$`attributes_Happy Hour` + filter$attributes_Smoking +
## filter$`attributes_Wi-Fi` + filter$review_count)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3298 -0.3624 0.0000 0.3228 1.1470
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 4.7883300 0.6694335 7.153
## filter$`attributes_Accepts Credit Cards`True -0.2410549 0.3249049 -0.742
## filter$attributes_Alcoholfull_bar -0.4624818 0.5587093 -0.828
## filter$attributes_Alcoholnone -0.2238488 0.8096041 -0.276
## filter$attributes_Attiredressy 0.2279444 0.3888094 0.586
## filter$attributes_CatersTrue 0.0211298 0.1420302 0.149
## filter$attributes_DeliveryTrue -0.3354632 0.5527995 -0.607
## filter$`attributes_Drive-Thru`True 0.1774006 0.5531987 0.321
## filter$`attributes_Good For Dancing`True -0.2104050 0.1804482 -1.166
## filter$`attributes_Good For Groups`True -0.3106324 0.3231334 -0.961
## filter$`attributes_Happy Hour`True -0.2614832 0.2109624 -1.239
## filter$attributes_Smokingoutdoor -0.1936269 0.1346372 -1.438
## filter$attributes_Smokingyes -0.1547730 0.3010814 -0.514
## filter$`attributes_Wi-Fi`no -0.0190040 0.1402631 -0.135
## filter$`attributes_Wi-Fi`paid -0.8257045 0.5530938 -1.493
## filter$review_count 0.0004436 0.0002485 1.785
## Pr(>|t|)
## (Intercept) 5.1e-10 ***
## filter$`attributes_Accepts Credit Cards`True 0.4605
## filter$attributes_Alcoholfull_bar 0.4105
## filter$attributes_Alcoholnone 0.7829
## filter$attributes_Attiredressy 0.5595
## filter$attributes_CatersTrue 0.8821
## filter$attributes_DeliveryTrue 0.5458
## filter$`attributes_Drive-Thru`True 0.7494
## filter$`attributes_Good For Dancing`True 0.2474
## filter$`attributes_Good For Groups`True 0.3395
## filter$`attributes_Happy Hour`True 0.2191
## filter$attributes_Smokingoutdoor 0.1546
## filter$attributes_Smokingyes 0.6087
## filter$`attributes_Wi-Fi`no 0.8926
## filter$`attributes_Wi-Fi`paid 0.1397
## filter$review_count 0.0784 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5385 on 74 degrees of freedom
## (21802 observations deleted due to missingness)
## Multiple R-squared: 0.2261, Adjusted R-squared: 0.06921
## F-statistic: 1.441 on 15 and 74 DF, p-value: 0.1513
Predict a rating based on reviews with Classification models SVM, GLMNET, MAXENT. We train a data of 9000 reviews and test around 200 reviews. We then plot the results
# Predict a rating based on review with Classification models
library("RTextTools")
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
dataset = restaurants[,c("text","stars.y","votes_useful")]
dataset$count = stri_count(dataset$text,regex="\\S+")
df = dataset[sample(nrow(dataset),9000),]
## Text cleaning within the create_matrix function
dtMatrix = create_matrix(df$text, language="english", removeNumbers=TRUE,stemWords=TRUE)
container = create_container(dtMatrix,cbind(df$stars,df$count,df$votes_useful),trainSize=1:8800,testSize= 8801:9000, virgin=FALSE)
low_model <- train_models(container, c("SVM","GLMNET","MAXENT"))
data_models = classify_models(container,low_model)
analytics = create_analytics(container,data_models)
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.00 0.52
## n >= 2 0.90 0.54
## n >= 3 0.39 0.68
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.564 0.524 0.532
## GLMNET_PRECISION GLMNET_RECALL GLMNET_FSCORE
## 0.420 0.380 0.376
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.418 0.424 0.418
# CREATE THE data.frame SUMMARIES
topic_summary <- analytics@label_summary
alg_summary <- analytics@algorithm_summary
ens_summary <-analytics@ensemble_summary
doc_summary <- analytics@document_summary
#Plotting the above values
top = doc_summary
plotdata = top[,c("SVM_PROB","GLMNET_PROB","MAXENTROPY_PROB")]
plotdata$REVIEW_NO = seq.int(nrow(plotdata))
library(reshape)
##
## Attaching package: 'reshape'
## The following objects are masked from 'package:plyr':
##
## rename, round_any
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
df <- melt(plotdata , id.vars = 'REVIEW_NO')
names(df) =c("Review_no","Prediction_Models","value")
ggplot(df, aes(x = Review_no, y = value, colour = Prediction_Models)) +
geom_line(size =0.5) + ylab(label="Prediction Probability") +
xlab("Review Number") +
scale_colour_manual(values=c("green", "blue","red"))

Plotting a WordCloud of packages used
# For showing the packages as a word cloud
packages = c(14,13,12,13,12,12,11,10,10,10,10)
class(packages)
## [1] "numeric"
plot_rating = t(matrix(packages))
names(packages) = c("readr","korpus","Rtexttools","maps","plyr","stringi","plotrix","ggplot2","tm","wordcloud","reshape")
tdm_names = names(packages)
wordcloud(tdm_names,packages,scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))
